home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Aminet 1 (Walnut Creek)
/
Aminet - June 1993 [Walnut Creek].iso
/
usenet
/
sources
/
volume2
/
unix
/
regexp.1
< prev
next >
Wrap
Text File
|
1988-12-06
|
20KB
|
748 lines
Path: xanth!ames!mailrus!ulowell!page
From: page@swan.ulowell.edu (Bob Page)
Newsgroups: comp.sources.amiga
Subject: v02i085: regexp - regular-expression routines, Part01/02
Message-ID: <10484@swan.ulowell.edu>
Date: 5 Dec 88 22:05:51 GMT
Organization: University of Lowell, Computer Science Dept.
Lines: 737
Approved: page@swan.ulowell.edu
Submitted-by: grwalter@watcgl.waterloo.edu
Posting-number: Volume 2, Issue 85
Archive-name: unix/regexp.1
This is a reimplementation of the Unix V8 regexp(3) package. It gives
C programs the ability to use egrep-style regular expressions, and
does it in a much cleaner fashion than the analogous routines in SysV.
[You need this to recompile stevie. ..Bob]
# This is a shell archive.
# Remove everything above and including the cut line.
# Then run the rest of the file through sh.
#----cut here-----cut here-----cut here-----cut here----#
#!/bin/sh
# shar: Shell Archiver
# Run the following text with /bin/sh to create:
# MAKE
# Makefile
# README
# tests
# timer.c
# try.c
# This archive created: Mon Nov 28 19:50:49 1988
cat << \SHAR_EOF > MAKE
lc -M -Rregexp.lib regexp.c regsub.c regerror.c
SHAR_EOF
cat << \SHAR_EOF > Makefile
# Things you might want to put in ENV and LENV:
# -Dvoid=int compilers that don't do void
# -DCHARBITS=0377 compilers that don't do unsigned char
# -DSTATIC=extern compilers that don't like "static foo();" as forward decl
# -DSTRCSPN library does not have strcspn()
# -Dstrchr=index library does not have strchr()
# -DERRAVAIL have utzoo-compatible error() function and friends
ENV=-DSTRCSPN
LENV=-DSTRCSPN
# Things you might want to put in TEST:
# -DDEBUG debugging hooks
# -I. regexp.h from current directory, not /usr/include
TEST=-I.
# Things you might want to put in PROF:
# -Dstatic='/* */' make everything global so profiler can see it.
# -p profiler
PROF=
CFLAGS=-O $(ENV) $(TEST) $(PROF)
LINTFLAGS=$(LENV) $(TEST) -ha
LDFLAGS=-i
OBJ=regexp.o regsub.o
LSRC=regexp.c regsub.c regerror.c
DTR=README dMakefile regexp.3 regexp.h regexp.c regsub.c regerror.c \
regmagic.h try.c timer.c tests
DEST = ..
try: try.o $(OBJ)
cc $(LDFLAGS) try.o $(OBJ) -o try
# Making timer will probably require putting stuff in $(PROF) and then
# recompiling everything; the following is just the final stage.
timer: timer.o $(OBJ)
cc $(LDFLAGS) $(PROF) timer.o $(OBJ) -o timer
timer.o: timer.c timer.t.h
timer.t.h: tests
sed 's/ /","/g;s/\\/&&/g;s/.*/{"&"},/' tests >timer.t.h
# Regression test.
r: try tests
@echo 'No news is good news...'
try <tests
lint: timer.t.h
@echo 'Complaints about multiply-declared regerror() are legit.'
lint $(LINTFLAGS) $(LSRC) try.c
lint $(LINTFLAGS) $(LSRC) timer.c
regexp.o: regexp.c regexp.h regmagic.h
regsub.o: regsub.c regexp.h regmagic.h
clean:
rm -f *.o *.out core mon.out timer.t.h dMakefile dtr try timer
dtr: r makedtr $(DTR)
makedtr $(DTR) >dtr
dMakefile: Makefile
sed '/^L*ENV=/s/ *-DERRAVAIL//' Makefile >dMakefile
mv: $(OBJ) regerror.o
mv $(OBJ) regerror.o $(DEST)
SHAR_EOF
cat << \SHAR_EOF > README
This is a nearly-public-domain reimplementation of the V8 regexp(3) package.
It gives C programs the ability to use egrep-style regular expressions, and
does it in a much cleaner fashion than the analogous routines in SysV.
Copyright (c) 1986 by University of Toronto.
Written by Henry Spencer. Not derived from licensed software.
Permission is granted to anyone to use this software for any
purpose on any computer system, and to redistribute it freely,
subject to the following restrictions:
1. The author is not responsible for the consequences of use of
this software, no matter how awful, even if they arise
from defects in it.
2. The origin of this software must not be misrepresented, either
by explicit claim or by omission.
3. Altered versions must be plainly marked as such, and must not
be misrepresented as being the original software.
Barring a couple of small items in the BUGS list, this implementation is
believed 100% compatible with V8. It should even be binary-compatible,
sort of, since the only fields in a "struct regexp" that other people have
any business touching are declared in exactly the same way at the same
location in the struct (the beginning).
This implementation is *NOT* AT&T/Bell code, and is not derived from licensed
software. Even though U of T is a V8 licensee. This software is based on
a V8 manual page sent to me by Dennis Ritchie (the manual page enclosed
here is a complete rewrite and hence is not covered by AT&T copyright).
The software was nearly complete at the time of arrival of our V8 tape.
I haven't even looked at V8 yet, although a friend elsewhere at U of T has
been kind enough to run a few test programs using the V8 regexp(3) to resolve
a few fine points. I admit to some familiarity with regular-expression
implementations of the past, but the only one that this code traces any
ancestry to is the one published in Kernighan & Plauger (from which this
one draws ideas but not code).
Simplistically: put this stuff into a source directory, copy regexp.h into
/usr/include, inspect Makefile for compilation options that need changing
to suit your local environment, and then do "make r". This compiles the
regexp(3) functions, compiles a test program, and runs a large set of
regression tests. If there are no complaints, then put regexp.o, regsub.o,
and regerror.o into your C library, and regexp.3 into your manual-pages
directory.
Note that if you don't put regexp.h into /usr/include *before* compiling,
you'll have to add "-I." to CFLAGS before compiling.
The files are:
Makefile instructions to make everything
regexp.3 manual page
regexp.h header file, for /usr/include
regexp.c source for regcomp() and regexec()
regsub.c source for regsub()
regerror.c source for default regerror()
regmagic.h internal header file
try.c source for test program
timer.c source for timing program
tests test list for try and timer
This implementation uses nondeterministic automata rather than the
deterministic ones found in some other implementations, which makes it
simpler, smaller, and faster at compiling regular expressions, but slower
at executing them. In theory, anyway. This implementation does employ
some special-case optimizations to make the simpler cases (which do make
up the bulk of regular expressions actually used) run quickly. In general,
if you want blazing speed you're in the wrong place. Replacing the insides
of egrep with this stuff is probably a mistake; if you want your own egrep
you're going to have to do a lot more work. But if you want to use regular
expressions a little bit in something else, you're in luck. Note that many
existing text editors use nondeterministic regular-expression implementations,
so you're in good company.
This stuff should be pretty portable, given appropriate option settings.
If your chars have less than 8 bits, you're going to have to change the
internal representation of the automaton, although knowledge of the details
of this is fairly localized. There are no "reserved" char values except for
NUL, and no special significance is attached to the top bit of chars.
The string(3) functions are used a fair bit, on the grounds that they are
probably faster than coding the operations in line. Some attempts at code
tuning have been made, but this is invariably a bit machine-specific.
SHAR_EOF
cat << \SHAR_EOF > tests
abc abc y & abc
abc xbc n - -
abc axc n - -
abc abx n - -
abc xabcy y & abc
abc ababc y & abc
ab*c abc y & abc
ab*bc abc y & abc
ab*bc abbc y & abbc
ab*bc abbbbc y & abbbbc
ab+bc abbc y & abbc
ab+bc abc n - -
ab+bc abq n - -
ab+bc abbbbc y & abbbbc
ab?bc abbc y & abbc
ab?bc abc y & abc
ab?bc abbbbc n - -
ab?c abc y & abc
^abc$ abc y & abc
^abc$ abcc n - -
^abc abcc y & abc
^abc$ aabc n - -
abc$ aabc y & abc
^ abc y &
$ abc y &
a.c abc y & abc
a.c axc y & axc
a.*c axyzc y & axyzc
a.*c axyzd n - -
a[bc]d abc n - -
a[bc]d abd y & abd
a[b-d]e abd n - -
a[b-d]e ace y & ace
a[b-d] aac y & ac
a[-b] a- y & a-
a[b-] a- y & a-
[k] ab n - -
a[b-a] - c - -
a[]b - c - -
a[ - c - -
a] a] y & a]
a[]]b a]b y & a]b
a[^bc]d aed y & aed
a[^bc]d abd n - -
a[^-b]c adc y & adc
a[^-b]c a-c n - -
a[^]b]c a]c n - -
a[^]b]c adc y & adc
ab|cd abc y & ab
ab|cd abcd y & ab
()ef def y &-\1 ef-
()* - c - -
*a - c - -
^* - c - -
$* - c - -
(*)b - c - -
$b b n - -
a\ - c - -
a\(b a(b y &-\1 a(b-
a\(*b ab y & ab
a\(*b a((b y & a((b
a\\b a\b y & a\b
abc) - c - -
(abc - c - -
((a)) abc y &-\1-\2 a-a-a
(a)b(c) abc y &-\1-\2 abc-a-c
a+b+c aabbabc y & abc
a** - c - -
a*? - c - -
(a*)* - c - -
(a*)+ - c - -
(a|)* - c - -
(a*|b)* - c - -
(a+|b)* ab y &-\1 ab-b
(a+|b)+ ab y &-\1 ab-b
(a+|b)? ab y &-\1 a-a
[^ab]* cde y & cde
(^)* - c - -
(ab|)* - c - -
)( - c - -
abc y &
abc n - -
a* y &
abcd abcd y &-\&-\\& abcd-&-\abcd
a(bc)d abcd y \1-\\1-\\\1 bc-\1-\bc
([abc])*d abbbcd y &-\1 abbbcd-c
([abc])*bcd abcd y &-\1 abcd-a
a|b|c|d|e e y & e
(a|b|c|d|e)f ef y &-\1 ef-e
((a*|b))* - c - -
abcd*efg abcdefg y & abcdefg
ab* xabyabbbz y & ab
ab* xayabbbz y & a
(ab|cd)e abcde y &-\1 cde-cd
[abhgefdc]ij hij y & hij
^(ab|cd)e abcde n x\1y xy
(abc|)ef abcdef y &-\1 ef-
(a|b)c*d abcd y &-\1 bcd-b
(ab|ab*)bc abc y &-\1 abc-a
a([bc]*)c* abc y &-\1 abc-bc
a([bc]*)(c*d) abcd y &-\1-\2 abcd-bc-d
a([bc]+)(c*d) abcd y &-\1-\2 abcd-bc-d
a([bc]*)(c+d) abcd y &-\1-\2 abcd-b-cd
a[bcd]*dcdcde adcdcde y & adcdcde
a[bcd]+dcdcde adcdcde n - -
(ab|a)b*c abc y &-\1 abc-ab
((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d
[ -~]* abc y & abc
[ -~ -~]* abc y & abc
[ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~ -~ -~ -~]* abc y & abc
[a-zA-Z_][a-zA-Z0-9_]* alpha y & alpha
^a(bc+|b[eh])g|.h$ abh y &-\1 bh-
(bc+d$|ef*g.|h?i(j|k)) effgz y &-\1-\2 effgz-effgz-
(bc+d$|ef*g.|h?i(j|k)) ij y &-\1-\2 ij-ij-j
(bc+d$|ef*g.|h?i(j|k)) effg n - -
(bc+d$|ef*g.|h?i(j|k)) bcdd n - -
(bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz-
((((((((((a)))))))))) - c - -
(((((((((a))))))))) a y & a
multiple words of text uh-uh n - -
multiple words multiple words, yeah y & multiple words
(.*)c(.*) abcde y &-\1-\2 abcde-ab-de
\((.*), (.*)\) (a, b) y (\2, \1) (b, a)
SHAR_EOF
cat << \SHAR_EOF > timer.c
/*
* Simple timing program for regcomp().
*
* Copyright (c) 1986 by University of Toronto. Written by Henry Spencer. Not
* derived from licensed software.
*
* Permission is granted to anyone to use this software for any purpose on any
* computer system, and to redistribute it freely, subject to the following
* restrictions:
*
* 1. The author is not responsible for the consequences of use of this
* software, no matter how awful, even if they arise from defects in it.
*
* 2. The origin of this software must not be misrepresented, either by explicit
* claim or by omission.
*
* 3. Altered versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
*
* Usage: timer ncomp nexec nsub or timer ncomp nexec nsub regexp string [
* answer [ sub ] ]
*
* The second form is for timing repetitions of a single test case. The first
* form's test data is a compiled-in copy of the "tests" file. Ncomp, nexec,
* nsub are how many times to do each regcomp, regexec, and regsub. The way
* to time an operation individually is to do something like "timer 1 50 1".
*/
#include <stdio.h>
struct try {
char *re, *str, *ans, *src, *dst;
} tests[] = {
#include "timer.t.h"
{
NULL, NULL, NULL, NULL, NULL
}
};
#include <regexp.h>
int errreport = 0; /* Report errors via errseen? */
char *errseen = NULL; /* Error message. */
char *progname;
/* ARGSUSED */
main(argc, argv)
int argc;
char *argv[];
{
int ncomp, nexec, nsub;
struct try one;
char dummy[512];
if (argc < 4) {
ncomp = 1;
nexec = 1;
nsub = 1;
} else {
ncomp = atoi(argv[1]);
nexec = atoi(argv[2]);
nsub = atoi(argv[3]);
}
progname = argv[0];
if (argc > 5) {
one.re = argv[4];
one.str = argv[5];
if (argc > 6)
one.ans = argv[6];
else
one.ans = "y";
if (argc > 7) {
one.src = argv[7];
one.dst = "xxx";
} else {
one.src = "x";
one.dst = "x";
}
errreport = 1;
try(one, ncomp, nexec, nsub);
} else
multiple(ncomp, nexec, nsub);
exit(0);
}
void
regerror(s)
char *s;
{
if (errreport)
errseen = s;
else
error(s, "");
}
#ifndef ERRAVAIL
error(s1, s2)
char *s1;
char *s2;
{
fprintf(stderr, "regexp: ");
fprintf(stderr, s1, s2);
fprintf(stderr, "\n");
exit(1);
}
#endif
int lineno = 0;
multiple(ncomp, nexec, nsub)
int ncomp, nexec, nsub;
{
register int i;
extern char *strchr();
errreport = 1;
for (i = 0; tests[i].re != NULL; i++) {
lineno++;
try(tests[i], ncomp, nexec, nsub);
}
}
try(fields, ncomp, nexec, nsub)
struct try fields;
int ncomp, nexec, nsub;
{
regexp *r;
char dbuf[BUFSIZ];
register int i;
errseen = NULL;
r = regcomp(fields.re);
if (r == NULL) {
if (*fields.ans != 'c')
complain("regcomp failure in `%s'", fields.re);
return;
}
if (*fields.ans == 'c') {
complain("unexpected regcomp success in `%s'", fields.re);
free((char *) r);
return;
}
for (i = ncomp - 1; i > 0; i--) {
free((char *) r);
r = regcomp(fields.re);
}
if (!regexec(r, fields.str)) {
if (*fields.ans != 'n')
complain("regexec failure in `%s'", "");
free((char *) r);
return;
}
if (*fields.ans == 'n') {
complain("unexpected regexec success", "");
free((char *) r);
return;
}
for (i = nexec - 1; i > 0; i--)
(void) regexec(r, fields.str);
errseen = NULL;
for (i = nsub; i > 0; i--)
regsub(r, fields.src, dbuf);
if (errseen != NULL) {
complain("regsub complaint", "");
free((char *) r);
return;
}
if (strcmp(dbuf, fields.dst) != 0)
complain("regsub result `%s' wrong", dbuf);
free((char *) r);
}
complain(s1, s2)
char *s1;
char *s2;
{
fprintf(stderr, "try: %d: ", lineno);
fprintf(stderr, s1, s2);
fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : "");
}
SHAR_EOF
cat << \SHAR_EOF > try.c
/*
* Simple test program for regexp(3) stuff. Knows about debugging hooks.
*
* Copyright (c) 1986 by University of Toronto. Written by Henry Spencer. Not
* derived from licensed software.
*
* Permission is granted to anyone to use this software for any purpose on any
* computer system, and to redistribute it freely, subject to the following
* restrictions:
*
* 1. The author is not responsible for the consequences of use of this
* software, no matter how awful, even if they arise from defects in it.
*
* 2. The origin of this software must not be misrepresented, either by explicit
* claim or by omission.
*
* 3. Altered versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
*
* Usage: try re [string [output [-]]] The re is compiled and dumped, regexeced
* against the string, the result is applied to output using regsub(). The -
* triggers a running narrative from regexec(). Dumping and narrative don't
* happen unless DEBUG.
*
* If there are no arguments, stdin is assumed to be a stream of lines with five
* fields: a r.e., a string to match it against, a result code, a source
* string for regsub, and the proper result. Result codes are 'c' for
* compile failure, 'y' for match success, 'n' for match failure. Field
* separator is tab.
*/
#include <stdio.h>
#include <regexp.h>
#ifdef ERRAVAIL
char *progname;
extern char *mkprogname();
#endif
#ifdef DEBUG
extern int regnarrate;
#endif
char buf[BUFSIZ];
int errreport = 0; /* Report errors via errseen? */
char *errseen = NULL; /* Error message. */
int status = 0; /* Exit status. */
/* ARGSUSED */
main(argc, argv)
int argc;
char *argv[];
{
regexp *r;
int i;
#ifdef ERRAVAIL
progname = mkprogname(argv[0]);
#endif
if (argc == 1) {
multiple();
exit(status);
}
r = regcomp(argv[1]);
if (r == NULL)
error("regcomp failure", "");
#ifdef DEBUG
regdump(r);
if (argc > 4)
regnarrate++;
#endif
if (argc > 2) {
i = regexec(r, argv[2]);
printf("%d", i);
for (i = 1; i < NSUBEXP; i++)
if (r->startp[i] != NULL && r->endp[i] != NULL)
printf(" \\%d", i);
printf("\n");
}
if (argc > 3) {
regsub(r, argv[3], buf);
printf("%s\n", buf);
}
exit(status);
}
void
regerror(s)
char *s;
{
if (errreport)
errseen = s;
else
error(s, "");
}
#ifndef ERRAVAIL
error(s1, s2)
char *s1;
char *s2;
{
fprintf(stderr, "regexp: ");
fprintf(stderr, s1, s2);
fprintf(stderr, "\n");
exit(1);
}
#endif
int lineno;
regexp badregexp; /* Implicit init to 0. */
multiple()
{
char rbuf[BUFSIZ];
char *field[5];
char *scan;
int i;
regexp *r;
extern char *strchr();
errreport = 1;
lineno = 0;
while (fgets(rbuf, sizeof(rbuf), stdin) != NULL) {
rbuf[strlen(rbuf) - 1] = '\0'; /* Dispense with \n. */
lineno++;
scan = rbuf;
for (i = 0; i < 5; i++) {
field[i] = scan;
if (field[i] == NULL) {
complain("bad testfile format", "");
exit(1);
}
scan = strchr(scan, '\t');
if (scan != NULL)
*scan++ = '\0';
}
try(field);
}
/* And finish up with some internal testing... */
lineno = 9990;
errseen = NULL;
if (regcomp((char *) NULL) != NULL || errseen == NULL)
complain("regcomp(NULL) doesn't complain", "");
lineno = 9991;
errseen = NULL;
if (regexec((regexp *) NULL, "foo") || errseen == NULL)
complain("regexec(NULL, ...) doesn't complain", "");
lineno = 9992;
r = regcomp("foo");
if (r == NULL) {
complain("regcomp(\"foo\") fails", "");
return;
}
lineno = 9993;
errseen = NULL;
if (regexec(r, (char *) NULL) || errseen == NULL)
complain("regexec(..., NULL) doesn't complain", "");
lineno = 9994;
errseen = NULL;
regsub((regexp *) NULL, "foo", rbuf);
if (errseen == NULL)
complain("regsub(NULL, ..., ...) doesn't complain", "");
lineno = 9995;
errseen = NULL;
regsub(r, (char *) NULL, rbuf);
if (errseen == NULL)
complain("regsub(..., NULL, ...) doesn't complain", "");
lineno = 9996;
errseen = NULL;
regsub(r, "foo", (char *) NULL);
if (errseen == NULL)
complain("regsub(..., ..., NULL) doesn't complain", "");
lineno = 9997;
errseen = NULL;
if (regexec(&badregexp, "foo") || errseen == NULL)
complain("regexec(nonsense, ...) doesn't complain", "");
lineno = 9998;
errseen = NULL;
regsub(&badregexp, "foo", rbuf);
if (errseen == NULL)
complain("regsub(nonsense, ..., ...) doesn't complain", "");
}
try(fields)
char **fields;
{
regexp *r;
char dbuf[BUFSIZ];
errseen = NULL;
r = regcomp(fields[0]);
if (r == NULL) {
if (*fields[2] != 'c')
complain("regcomp failure in `%s'", fields[0]);
return;
}
if (*fields[2] == 'c') {
complain("unexpected regcomp success in `%s'", fields[0]);
free((char *) r);
return;
}
if (!regexec(r, fields[1])) {
if (*fields[2] != 'n')
complain("regexec failure in `%s'", "");
free((char *) r);
return;
}
if (*fields[2] == 'n') {
complain("unexpected regexec success", "");
free((char *) r);
return;
}
errseen = NULL;
regsub(r, fields[3], dbuf);
if (errseen != NULL) {
complain("regsub complaint", "");
free((char *) r);
return;
}
if (strcmp(dbuf, fields[4]) != 0)
complain("regsub result `%s' wrong", dbuf);
free((char *) r);
}
complain(s1, s2)
char *s1;
char *s2;
{
fprintf(stderr, "try: %d: ", lineno);
fprintf(stderr, s1, s2);
fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : "");
status = 1;
}
SHAR_EOF
# End of shell archive
exit 0
--
Bob Page, U of Lowell CS Dept. page@swan.ulowell.edu ulowell!page
Have five nice days.